In [1]:
import os
import glob
import pandas as pd
In [2]:
# Common prefix of the per-season salary CSV URLs; the download step appends
# "<year>.csv" to it.
URL = ('https://github.com/data-is-plural/mls-salaries/raw/master/csvs/'
       'mls-salaries-')
In [3]:
# Seasons to process: 2007 through 2016 inclusive (range() excludes its upper
# bound, hence the + 1).
first_season, last_season = 2007, 2016
years = range(first_season, last_season + 1)
In [4]:
# Fetch one salary CSV per season into the current working directory, saved as
# "<year>_salaries.csv". A season whose file is already present is skipped, so
# re-running this cell does not download it again; delete the file to refresh.
try:
    from urllib.request import urlretrieve  # Python 3
except ImportError:
    from urllib import urlretrieve  # Python 2

for year in years:
    season_name = str(year)
    target = season_name + '_salaries.csv'
    if os.path.exists(target):
        continue
    # Download under a temporary name and rename only on success, so an
    # interrupted transfer never leaves a truncated "<year>_salaries.csv".
    partial = target + '.part'
    urlretrieve(URL + season_name + '.csv', partial)
    os.rename(partial, target)
In [5]:
# List the season files in the current working directory.
# Only "*_salaries.csv" is matched (the naming used when the files are
# downloaded) so unrelated CSVs lying in the same directory are not picked up.
# glob returns paths in arbitrary order; sorting makes the load order, and
# therefore the row order of the combined frame, reproducible.
path = os.getcwd()
all_files = sorted(glob.glob(os.path.join(path, "*_salaries.csv")))
In [6]:
# Load every season file into one DataFrame, tagging each row with its season
# and a combined "first last" player name.
#
# The season is taken from the exact file name ("<year>_salaries.csv"), not
# from a substring search over the whole path: a year that happens to appear
# in a directory name would otherwise match every file and load it under the
# wrong season. Files that are not season files are ignored.
season_by_filename = {str(year) + '_salaries.csv': year for year in years}

frames = []
for csv in all_files:
    year = season_by_filename.get(os.path.basename(csv))
    if year is None:
        continue
    print(year)  # progress: one line per season loaded
    season = pd.read_csv(csv)
    season['season'] = year
    # NOTE(review): a row with a missing first or last name gets NaN as its
    # player name (NaN + str is NaN in pandas) -- confirm that is acceptable.
    season['player'] = season['first_name'] + " " + season['last_name']
    frames.append(season)

# Concatenate once at the end instead of growing the frame inside the loop;
# fall back to an empty frame when no season file was found.
data = pd.concat(frames) if frames else pd.DataFrame()
In [7]:
# Peek at the first five rows of the combined frame.
data.head(n=5)
Out[7]:
In [8]:
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline
In [9]:
# Use matplotlib's bundled "ggplot" stylesheet for every figure drawn below.
from matplotlib import pyplot as plt
plt.style.use('ggplot')
In [10]:
# Plot the per-season median of every numeric column. Non-numeric columns
# (names, for example) are dropped first because a median is undefined for
# them and recent pandas versions raise instead of silently skipping them.
numeric_columns = data.select_dtypes(include=['number'])
(numeric_columns
 .groupby('season')
 .median()
 .plot(legend=False, title='Median of numeric columns by season'));
In [11]:
# Reshape to one row per season and one column per player; each cell is the
# sum of base_salary over that player's rows in that season.
pivoted = data.pivot_table(
    values='base_salary',
    index='season',
    columns='player',
    aggfunc='sum',
)
# Show only the top-left 5x5 corner; the full table has one column per player.
pivoted.iloc[0:5, 0:5]
Out[11]:
In [12]:
# One faint line per player: base salary summed per season (the values held
# in `pivoted`). The low alpha lets overlapping lines build up visible density.
Y_AXIS_CAP = 500000  # upper y-limit; salaries above it are drawn off-scale
ax = pivoted.plot(legend=False, alpha=0.05, title='Base salary per player by season')
ax.set_ylabel('base_salary')
ax.set_ylim(0, Y_AXIS_CAP);
In [ ]: